Load packages:
library(tidyverse)
library(labelled)Resources:
num_obs <- 10000
# Generate standard normal distribution (default is mean = 0 and sd = 1)
set.seed(124)
stdnorm_dist <- rnorm(num_obs, mean = 0, sd = 1) # equivalent to rnorm(10)
# Generate normal distribution w/ custom mean and sd
set.seed(124)
norm_dist <- rnorm(num_obs, mean = 50, sd = 5)
# Generate left-skewed distribution
set.seed(124)
lskew_dist <- rbeta(num_obs, 5, 2)
# Generate right-skewed distribution
set.seed(124)
rskew_dist <- rbeta(num_obs, 2, 5)
# Create dataframe
df_generated_pop <- data.frame(stdnorm_dist, norm_dist, lskew_dist, rskew_dist)df_ipeds_pop %>% head()## # A tibble: 6 x 38
## instnm unitid opeid6 opeid control c15basic stabbr city zip locale region tuit_grad_res fee_grad_res tuit_grad_nres fee_grad_nres tuit_md_res fee_md_res tuit_md_nres fee_md_nres tuit_law_res fee_law_res tuit_law_nres fee_law_nres books_supplies roomboard_off oth_expense_off tuitfee_grad_res tuitfee_grad_nres tuitfee_md_res tuitfee_md_nres tuitfee_law_res tuitfee_law_nres coa_grad_res coa_grad_nres coa_md_res coa_md_nres coa_law_res coa_law_nres
## <chr> <dbl> <chr> <chr> <dbl+lbl> <dbl+lbl> <chr+lbl> <chr> <chr> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama A & M University 100654 001002 00100200 1 [Public] 18 [Master^s Colleges & Universities: Larger Programs] AL [Alabama] Normal 35762 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 10128 1414 20160 1414 NA NA NA NA NA NA NA NA 1600 9240 3090 11542 21574 NA NA NA NA 25472 35504 NA NA NA NA
## 2 University of Alabama at Birmingham 100663 001052 00105200 1 [Public] 15 [Doctoral Universities: Highest Research Activity] AL [Alabama] Birmingham 35294-0110 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 8100 0 19188 0 28978 0 62714 0 NA NA NA NA 1200 12307 5555 8100 19188 28978 62714 NA NA 27162 38250 48040 81776 NA NA
## 3 Amridge University 100690 025034 02503400 2 [Private not-for-profit] 20 [Master^s Colleges & Universities: Small Programs] AL [Alabama] Montgomery 36117-3553 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 11700 1300 11700 1300 NA NA NA NA NA NA NA NA 900 9600 1600 13000 13000 NA NA NA NA 25100 25100 NA NA NA NA
## 4 University of Alabama in Huntsville 100706 001055 00105500 1 [Public] 16 [Doctoral Universities: Higher Research Activity] AL [Alabama] Huntsville 35899 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 10632 826 24430 826 NA NA NA NA NA NA NA NA 2120 10400 3994 11458 25256 NA NA NA NA 27972 41770 NA NA NA NA
## 5 Alabama State University 100724 001005 00100500 1 [Public] 19 [Master^s Colleges & Universities: Medium Programs] AL [Alabama] Montgomery 36104-0271 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 7416 2740 14832 2740 NA NA NA NA NA NA NA NA 1600 7320 4228 10156 17572 NA NA NA NA 23304 30720 NA NA NA NA
## 6 The University of Alabama 100751 001051 00105100 1 [Public] 16 [Doctoral Universities: Higher Research Activity] AL [Alabama] Tuscaloosa 35487-0100 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 10780 0 30250 0 28978 0 62714 0 23610 0 43060 0 1000 13636 4600 10780 30250 28978 62714 23610 43060 30016 49486 48214 81950 42846 62296
df_generated_pop %>% head()## stdnorm_dist norm_dist lskew_dist rskew_dist
## 1 -1.38507062 43.07465 0.9172827 0.08271725
## 2 0.03832318 50.19162 0.7064826 0.29351743
## 3 -0.76303016 46.18485 0.8444117 0.15558827
## 4 0.21230614 51.06153 0.6694614 0.33053865
## 5 1.42553797 57.12769 0.3488487 0.65115131
## 6 0.74447982 53.72240 0.5401472 0.45985283
# Function to get sampling distribution (default: 1000 samples of size 200)
get_sampling_distribution <- function(data_vec, num_samples = 1000, sample_size = 200) {
sample_means <- vector(mode = 'numeric', num_samples)
for (i in 1:length(sample_means)) {
samp <- sample(data_vec, sample_size)
sample_means[[i]] <- mean(samp, na.rm = T)
}
sample_means
}
# Function to generate plot
plot_distribution <- function(data_df, data_var, group_var = NULL, group_cat = NULL, show_group_hist = F, sampling_dist = F, plot_title = '') {
# Prep dataframe
two_pop <- !is.null(group_var)
two_dist <- !is.null(group_var) && !sampling_dist
data_df[[data_var]] <- unclass(data_df[[data_var]]) # unclass haven_labelled
if (two_pop) {
data_df[[group_var]] <- unclass(data_df[[group_var]])
}
data_df <- data_df %>% filter(!is.na(get(data_var))) # remove NA rows
# If group_cat not provided, use 2 values from group_var
if (two_pop && is.null(group_cat)) {
group_cat <- sort(unique(na.omit(data_df[[group_var]])))[1:2]
}
# Create population vector(s)
if (!two_pop) { # single population
data_vec1 <- data_df[[data_var]]
if (sampling_dist) { # sampling distribution
data_vec1 <- get_sampling_distribution(data_vec1)
}
} else { # two populations
data_vec1 <- (data_df %>% filter(get(group_var) == group_cat[[1]]))[[data_var]]
data_vec2 <- (data_df %>% filter(get(group_var) == group_cat[[2]]))[[data_var]]
if (sampling_dist) { # sampling distribution
data_vec1_samp <- get_sampling_distribution(data_vec1)
data_vec2_samp <- get_sampling_distribution(data_vec2)
data_vec1 <- data_vec1_samp - data_vec2_samp
}
}
# Create statistics dataframe
if (!two_dist) {
lines_vec <- c('dotted')
stats_vec <- c(mean(data_vec1), median(data_vec1))
legend_title <- 'Statistics'
if (two_pop && sampling_dist) {
legend_title <- paste0('Statistics\n(', group_var, '=', group_cat[[1]], ' - ', group_var, '=', group_cat[[2]], ')')
}
} else {
lines_vec <- c('dotted', 'dotdash')
stats_vec <- c(mean(data_vec1), median(data_vec1), mean(data_vec2), median(data_vec2))
legend_title <- paste0('Statistics\n(', group_var, '=', group_cat[[1]], ' vs. ', group_var, '=', group_cat[[2]], ')')
}
stats_df <- data.frame(
pop = rep(lines_vec, each = 2),
stat = rep(c('blue', 'red'), times = if_else(two_dist, 2, 1)),
val = stats_vec
)
stats_df$pop <- factor(stats_df$pop, levels = c('dotted', 'dotdash'))
# Legend text
legend_text <- c(paste('Mean:', round(mean(data_vec1), 2),
'\nStd Dev:', round(sd(data_vec1), 2)),
paste('Median:', round(median(data_vec1), 2)))
if (two_dist) {
legend_text <- c(legend_text,
paste('Mean:', round(mean(data_vec2), 2),
'\nStd Dev:', round(sd(data_vec2), 2)),
paste('Median:', round(median(data_vec2), 2)))
}
# Plot distribution(s)
p <- ggplot() +
ggtitle(plot_title) + xlab('') + ylab('') +
geom_density(aes(x = data_vec1), alpha = 0.8)
if (!two_dist || show_group_hist) { # show histogram only if 1 pop or show_group_hist is TRUE
p <- p +
geom_histogram(aes(x = data_vec1, y = ..density..), alpha = 0.4, position = 'identity')
}
if (two_dist) {
p <- p +
geom_density(aes(x = data_vec2), alpha = 0.8)
if (show_group_hist) { # show histogram only if show_group_hist is TRUE
p <- p +
geom_histogram(aes(x = data_vec2, y = ..density..), alpha = 0.4, position = 'identity', fill = 'wheat4')
}
}
p <- p +
geom_vline(data = stats_df,
aes(xintercept = val, color = interaction(stat, pop), linetype = interaction(stat, pop)),
size = 0.6, alpha = 0.8) +
scale_color_manual(name = legend_title,
labels = legend_text,
values = as.character(stats_df$stat)) +
scale_linetype_manual(name = legend_title,
labels = legend_text,
values = as.character(stats_df$pop)) +
theme(plot.title = element_text(size = 10, face = 'bold', hjust = 0.5),
legend.title = element_text(size = 9, face = 'bold'),
legend.text = element_text(size = 8)) +
guides(col = guide_legend(ncol = if_else(two_dist, 2, 1)))
p
}
# Randomly generated data: standard normal
df_generated_pop %>% plot_distribution('stdnorm_dist')# Randomly generated data: normal
df_generated_pop %>% plot_distribution('norm_dist')# Randomly generated data: left-skewed
df_generated_pop %>% plot_distribution('lskew_dist')# Randomly generated data: right-skewed
df_generated_pop %>% plot_distribution('rskew_dist')# IPEDS data: tuitfee_grad_res
df_ipeds_pop %>% plot_distribution('tuitfee_grad_res')# IPEDS data: tuitfee_grad_res by control
df_ipeds_pop %>% plot_distribution('tuitfee_grad_res', 'control')# IPEDS data: tuitfee_grad_res by control, shade histogram
df_ipeds_pop %>% plot_distribution('tuitfee_grad_res', 'control', show_group_hist = T)# IPEDS data: tuitfee_grad_res by control, custom order
df_ipeds_pop %>% plot_distribution('tuitfee_grad_res', 'control', group_cat = c(2, 1))# IPEDS data: tuitfee_grad_res by control, custom categories
df_ipeds_pop %>% plot_distribution('tuitfee_grad_res', 'control', group_cat = c(2, 3))# Take single random sample of size 200
df_generated_sample <- df_generated_pop[sample(nrow(df_generated_pop), 200), ]
df_ipeds_sample <- df_ipeds_pop[sample(nrow(df_ipeds_pop), 200), ]
# Randomly generated data: standard normal
df_generated_sample %>% plot_distribution('stdnorm_dist')# Randomly generated data: normal
df_generated_sample %>% plot_distribution('norm_dist')# Randomly generated data: left-skewed
df_generated_sample %>% plot_distribution('lskew_dist')# Randomly generated data: right-skewed
df_generated_sample %>% plot_distribution('rskew_dist')# IPEDS data: tuitfee_grad_res
df_ipeds_sample %>% plot_distribution('tuitfee_grad_res')# Randomly generated data: standard normal
df_generated_pop %>% plot_distribution('stdnorm_dist', sampling_dist = T)# Randomly generated data: normal
df_generated_pop %>% plot_distribution('norm_dist', sampling_dist = T)# Randomly generated data: left-skewed
df_generated_pop %>% plot_distribution('lskew_dist', sampling_dist = T)# Randomly generated data: right-skewed
df_generated_pop %>% plot_distribution('rskew_dist', sampling_dist = T)# IPEDS data: tuitfee_grad_res
df_ipeds_pop %>% plot_distribution('tuitfee_grad_res', sampling_dist = T)# IPEDS data: tuitfee_grad_res by control
df_ipeds_pop %>% plot_distribution('tuitfee_grad_res', 'control', sampling_dist = T)# IPEDS data: tuitfee_grad_res by control, custom order
df_ipeds_pop %>% plot_distribution('tuitfee_grad_res', 'control', group_cat = c(2, 1), sampling_dist = T)library(patchwork)
# Use / to place plots 1 per row (stacked)
plot_distribution(df_generated_pop, 'norm_dist', plot_title = 'Population distribution') /
plot_distribution(df_generated_sample, 'norm_dist', plot_title = 'Single sample distribution') /
plot_distribution(df_generated_pop, 'norm_dist', sampling_dist = T,
plot_title = 'Sampling distribution')# Or use + and plot_layout()
plot_distribution(df_ipeds_pop, 'tuitfee_grad_res', 'control', group_cat = c(2, 1),
plot_title = 'Population distributions') +
plot_distribution(df_ipeds_sample, 'tuitfee_grad_res', 'control', group_cat = c(2, 1),
plot_title = 'Single sample distributions') +
plot_distribution(df_ipeds_pop, 'tuitfee_grad_res', 'control', group_cat = c(2, 1),
sampling_dist = T, plot_title = 'Sampling distribution differences') +
plot_layout(ncol = 1)# Function to generate t-distribution plot
plot_t_distribution <- function(data_vec, mu, alpha = 0.05, alternative = 'two.sided', plot_title = '', shade_rejection = T, shade_pval = T, stacked = F) {
data_vec <- na.omit(data_vec)
# Calculate t-statistics
sample_size <- length(data_vec)
deg_freedom <- sample_size - 1
xbar <- mean(data_vec)
s <- sd(data_vec)
std_err <- s / sqrt(sample_size)
t <- (xbar - mu) / std_err
# Calculate critical value and p-value
if (alternative == 'less') { # left-tailed
cv_lower <- qt(p = alpha, df = deg_freedom, lower.tail = T)
cv_legend <- round(cv_lower, 2)
cv_legend2 <- round(cv_lower * std_err + mu, 2)
pval <- round(pt(q = t, df = deg_freedom, lower.tail = T), 4)
} else if (alternative == 'greater') { # right-tailed
cv_upper <- qt(p = alpha, df = deg_freedom, lower.tail = F)
cv_legend <- round(cv_upper, 2)
cv_legend2 <- round(cv_upper * std_err + mu, 2)
pval <- round(pt(q = t, df = deg_freedom, lower.tail = F), 4)
} else { # two-tailed
cv_lower <- qt(p = alpha / 2, df = deg_freedom, lower.tail = T)
cv_upper <- qt(p = alpha / 2, df = deg_freedom, lower.tail = F)
cv_legend <- str_c('\u00B1', round(cv_upper, 2))
cv_legend2 <- str_c(round(cv_lower * std_err + mu, 2), ' & ', round(cv_upper * std_err + mu, 2))
pval_half <- round(pt(q = t, df = deg_freedom, lower.tail = t < 0), 4)
pval <- str_c(pval_half, ' + ', pval_half, ' = ', 2 * pval_half)
}
# Plot t-distribution
p <- ggplot(data.frame(x = -c(-4, 4)), aes(x)) +
ggtitle(plot_title) + xlab('') + ylab('') +
stat_function(fun = dt, args = list(df = deg_freedom), xlim = c(-4, 4))
# Shade rejection region using critical value
if (alternative != 'greater') {
p <- p + geom_vline(aes(xintercept = cv_lower, color = 'cval'),
linetype = 'dotted', size = 0.8, alpha = 0.8)
if (shade_rejection) {
p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
xlim = c(-4, cv_lower),
geom = 'area', alpha = 0.3, fill = 'red')
}
if (shade_pval) {
p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
xlim = c(-4, if_else(alternative == 'two.sided', -abs(t), t)),
geom = 'area', alpha = 0.3, fill = 'blue')
}
}
if (alternative != 'less') {
p <- p + geom_vline(aes(xintercept = cv_upper, color = 'cval'),
linetype = 'dotted', size = 0.8, alpha = 0.8)
if (shade_rejection) {
p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
xlim = c(cv_upper, 4),
geom = 'area', alpha = 0.3, fill = 'red')
}
if (shade_pval) {
p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
xlim = c(if_else(alternative == 'two.sided', abs(t), t), 4),
geom = 'area', alpha = 0.3, fill = 'blue')
}
}
# Legend text
legend_text <- c('t-statistics / p-value', 'critical value / alpha')
if (stacked) {
legend_text <- c(str_c('t-statistics: ', round(t, 2),
'\n(p-value: ', str_extract(pval, '[\\d.-]+$'), ')'),
str_c('Critical value: ', cv_legend,
'\n(alpha: ', round(alpha, 2), ')'))
}
stats_text <- c(str_c('t-statistics: ', round(t, 2)),
str_c('SE: ', round(std_err, 2)),
str_c('p-value: ', pval),
str_c('Critical value: ', cv_legend),
str_c('alpha: ', round(alpha, 2)))
if (!stacked) {
p <- p +
annotate('text', size = 9*5/14, x = 4.84, y = 0.14, hjust = 0,
label = 'bold(Statistics)', parse = T) +
annotate('text', size = 8*5/14, x = 4.89, y = 0:4 * -0.015 + 0.12, hjust = 0,
label = stats_text)
}
# Label plot
p <- p +
geom_vline(aes(xintercept = t, color = 'tstat'),
linetype = 'dotted', size = 0.8, alpha = 0.8) +
scale_x_continuous(sec.axis = sec_axis(trans = ~ . * std_err + mu)) +
scale_color_manual(name = if_else(stacked, 'Statistics', 'Legend'),
breaks = c('tstat', 'cval'),
labels = legend_text,
values = c(tstat = 'blue', cval = 'red')) +
theme(plot.title = element_text(size = 10, face = 'bold', hjust = 0.5),
plot.margin = unit(c(5.5, if_else(stacked, 5.5, 30), 5.5, 5.5), 'pt'),
legend.title = element_text(size = 9, face = 'bold'),
legend.text = element_text(size = 8)) +
coord_cartesian(xlim = c(-4, 4),
clip = 'off')
p
}
# H0: population mean of coa_grad_res is $29,000
# Ha: population mean of coa_grad_res is NOT $29,000 (two-sided test)
# Verdict: Reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 29000)##
## One Sample t-test
##
## data: df_ipeds_sample$coa_grad_res
## t = 1.8026, df = 199, p-value = 0.07296
## alternative hypothesis: true mean is not equal to 29000
## 95 percent confidence interval:
## 28875.55 31774.47
## sample estimates:
## mean of x
## 30325.01
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 29000)# H0: population mean of coa_grad_res is $32,000
# Ha: population mean of coa_grad_res is NOT $32,000 (two-sided test)
# Verdict: Fail to reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 32000)##
## One Sample t-test
##
## data: df_ipeds_sample$coa_grad_res
## t = -2.2788, df = 199, p-value = 0.02374
## alternative hypothesis: true mean is not equal to 32000
## 95 percent confidence interval:
## 28875.55 31774.47
## sample estimates:
## mean of x
## 30325.01
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 32000)# H0: population mean of coa_grad_res is $33,000
# Ha: population mean of coa_grad_res is LESS THAN $33,000 (left-sided test)
# Verdict: Reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 33000, alternative = 'less')##
## One Sample t-test
##
## data: df_ipeds_sample$coa_grad_res
## t = -3.6393, df = 199, p-value = 0.0001742
## alternative hypothesis: true mean is less than 33000
## 95 percent confidence interval:
## -Inf 31539.7
## sample estimates:
## mean of x
## 30325.01
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 33000, alternative = 'less')# H0: population mean of coa_grad_res is $31,000
# Ha: population mean of coa_grad_res is GREATER THAN $31,000 (right-sided test)
# Verdict: Fail to reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 31000, alternative = 'greater')##
## One Sample t-test
##
## data: df_ipeds_sample$coa_grad_res
## t = -0.91831, df = 199, p-value = 0.8202
## alternative hypothesis: true mean is greater than 31000
## 95 percent confidence interval:
## 29110.32 Inf
## sample estimates:
## mean of x
## 30325.01
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 31000, alternative = 'greater')plot_distribution(df_ipeds_sample$coa_grad_res, plot_title = 'Population distribution') +
plot_distribution(df_ipeds_sample$coa_grad_res, plot_title = 'Single sample distribution') +
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 29000,
plot_title = 'Sampling distribution under null', stacked = T) +
plot_layout(ncol = 1)t.test(formula = tuitfee_grad_res ~ control, mu = 0, data = df_ipeds_pop, subset = unclass(control) != 3)##
## Welch Two Sample t-test
##
## data: tuitfee_grad_res by control
## t = -14.621, df = 616.01, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8647.913 -6599.881
## sample estimates:
## mean in group 1 mean in group 2
## 10649.68 18273.58